PDP_example

data_folder <- "Data/Bike-Sharing-Dataset"
files <- list.files(data_folder, full.names = T)
files
[1] "Data/Bike-Sharing-Dataset/day.csv"   
[2] "Data/Bike-Sharing-Dataset/hour.csv"  
[3] "Data/Bike-Sharing-Dataset/Readme.txt"
day <- read_csv(files[[1]])
hour <- read_csv(files[[2]])
day
# A tibble: 731 x 16
   instant dteday     season    yr  mnth holiday weekday workingday
     <dbl> <date>      <dbl> <dbl> <dbl>   <dbl>   <dbl>      <dbl>
 1       1 2011-01-01      1     0     1       0       6          0
 2       2 2011-01-02      1     0     1       0       0          0
 3       3 2011-01-03      1     0     1       0       1          1
 4       4 2011-01-04      1     0     1       0       2          1
 5       5 2011-01-05      1     0     1       0       3          1
 6       6 2011-01-06      1     0     1       0       4          1
 7       7 2011-01-07      1     0     1       0       5          1
 8       8 2011-01-08      1     0     1       0       6          0
 9       9 2011-01-09      1     0     1       0       0          0
10      10 2011-01-10      1     0     1       0       1          1
# … with 721 more rows, and 8 more variables: weathersit <dbl>,
#   temp <dbl>, atemp <dbl>, hum <dbl>, windspeed <dbl>, casual <dbl>,
#   registered <dbl>, cnt <dbl>
day %>% skimr::skim()
Data summary
Name Piped data
Number of rows 731
Number of columns 16
_______________________
Column type frequency:
Date 1
numeric 15
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
dteday 0 1 2011-01-01 2012-12-31 2012-01-01 731

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
instant 0 1 366.00 211.17 1.00 183.50 366.00 548.50 731.00 ▇▇▇▇▇
season 0 1 2.50 1.11 1.00 2.00 3.00 3.00 4.00 ▇▇▁▇▇
yr 0 1 0.50 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
mnth 0 1 6.52 3.45 1.00 4.00 7.00 10.00 12.00 ▇▅▅▅▇
holiday 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
weekday 0 1 3.00 2.00 0.00 1.00 3.00 5.00 6.00 ▇▃▃▃▇
workingday 0 1 0.68 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
weathersit 0 1 1.40 0.54 1.00 1.00 1.00 2.00 3.00 ▇▁▅▁▁
temp 0 1 0.50 0.18 0.06 0.34 0.50 0.66 0.86 ▂▇▇▇▅
atemp 0 1 0.47 0.16 0.08 0.34 0.49 0.61 0.84 ▂▇▆▇▂
hum 0 1 0.63 0.14 0.00 0.52 0.63 0.73 0.97 ▁▁▆▇▂
windspeed 0 1 0.19 0.08 0.02 0.13 0.18 0.23 0.51 ▃▇▅▁▁
casual 0 1 848.18 686.62 2.00 315.50 713.00 1096.00 3410.00 ▇▆▂▁▁
registered 0 1 3656.17 1560.26 20.00 2497.00 3662.00 4776.50 6946.00 ▂▅▇▅▃
cnt 0 1 4504.35 1937.21 22.00 3152.00 4548.00 5956.00 8714.00 ▂▅▇▅▃
day %>% ggplot() + geom_point(aes(x = atemp, y = cnt, color = factor(season))) + 
    theme_bw()

cats <- day %>% map(~length(unique(.))) %>% cbind %>% data.frame() %>% rownames_to_column() %>% 
    unnest %>% setNames(nm = c("var", "n"))

vars_implicit_catgory <- cats$var[cats$n < 5]
vars_implicit_catgory
[1] "season"     "yr"         "holiday"    "workingday" "weathersit"
lapply(vars_implicit_catgory, function(v) {
    day %>% mutate(`:=`(!!sym(v), factor(!!sym(v)))) %>% ggplot() + geom_violin(aes_string(x = v, 
        y = "cnt", color = v), fill = "gray80", alpha = 0.1) + geom_jitter(aes_string(x = v, 
        y = "cnt", color = v), alpha = 0.2, width = 0.1) + theme_light()
})
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]

day <- day %>% mutate(season = factor(season), holiday = factor(holiday), workingday = factor(workingday), 
    weathersit = factor(weathersit)) %>% mutate(days_since_2011 = lubridate::interval(lubridate::ymd("2011-01-01"), 
    dteday)/lubridate::ddays(1))

set.seed(123)
ind_tr <- (1:nrow(day))[sample(x = c(T, F), size = nrow(day), prob = c(0.7, 
    0.3), replace = T)]
ind_te <- setdiff(1:nrow(day), ind_tr)

day_m_tr <- day[ind_tr, ] %>% mutate(split = "train")
day_m_te <- day[ind_te, ] %>% mutate(split = "test")

unused_cols <- c("casual", "registered", "instant", "dteday", "yr", "mnth", 
    "weekday", "split")
lm_01 <- model <- step(lm(formula = cnt ~ ., data = day_m_tr %>% select(-one_of(unused_cols))), 
    direction = "both")
Start:  AIC=6886.74
cnt ~ season + holiday + workingday + weathersit + temp + atemp + 
    hum + windspeed + days_since_2011

                  Df Sum of Sq       RSS    AIC
- atemp            1       966 354721851 6884.7
<none>                         354720885 6886.7
- workingday       1   2070471 356791356 6887.7
- holiday          1   6919410 361640294 6894.6
- temp             1   9353511 364074395 6898.0
- hum              1  18451116 373172001 6910.6
- windspeed        1  26023229 380744114 6920.8
- weathersit       2  34273585 388994470 6929.8
- season           3  58546210 413267095 6958.6
- days_since_2011  1 435762280 790483165 7293.4

Step:  AIC=6884.74
cnt ~ season + holiday + workingday + weathersit + temp + hum + 
    windspeed + days_since_2011

                  Df Sum of Sq       RSS    AIC
<none>                         354721851 6884.7
- workingday       1   2071470 356793321 6885.7
+ atemp            1       966 354720885 6886.7
- holiday          1   6921957 361643808 6892.6
- hum              1  18510236 373232087 6908.7
- windspeed        1  26960785 381682636 6920.1
- weathersit       2  34356488 389078339 6927.9
- season           3  59877458 414599309 6958.3
- temp             1 157523827 512245678 7070.2
- days_since_2011  1 435876229 790598080 7291.5
lm_01

Call:
lm(formula = cnt ~ season + holiday + workingday + weathersit + 
    temp + hum + windspeed + days_since_2011, data = day_m_tr %>% 
    select(-one_of(unused_cols)))

Coefficients:
    (Intercept)          season2          season3          season4  
        1644.81           762.63          -129.44           171.63  
       holiday1      workingday1      weathersit2      weathersit3  
        -717.59           145.13          -347.78         -2015.12  
           temp              hum        windspeed  days_since_2011  
        5517.46         -1897.10         -3227.30             4.95  
lm_results <- list(day_m_tr, day_m_te) %>% bind_rows %>% group_by(split) %>% 
    nest %>% mutate(pred = map(data, ~predict(object = lm_01, newdata = .))) %>% 
    mutate(data = map2(data, pred, ~bind_cols(data, pred))) %>% select(data) %>% 
    unnest %>% ungroup

lm_results %>% ggplot() + geom_point(aes(x = cnt, y = V1, color = split, shape = split), 
    size = 3, alpha = 0.3) + theme_bw() + scale_color_aaas() + theme(axis.text.x = element_text(size = 12), 
    axis.title.x = element_text(size = 14), axis.text.y = element_text(size = 12), 
    axis.title.y = element_text(size = 14), legend.background = element_rect(color = "gray70"), 
    legend.position = c(0, 1), legend.justification = c(0, 1), legend.text = element_text(size = 12), 
    legend.title = element_text(size = 14)) + ylab("predicted") + xlab("count")

library(randomForest)
rf_01 <- model <- randomForest(cnt ~ ., data = day_m_tr %>% select(-one_of(unused_cols)), 
    importance = TRUE)

tr_new <- data.frame(cnt = day_m_tr$cnt, model.matrix(cnt ~ ., data = day_m_tr %>% 
    select(-one_of(unused_cols))) %>% data.frame %>% select(-1)) %>% add_column(split = "train", 
    .before = 1)
te_new <- data.frame(cnt = day_m_te$cnt, model.matrix(cnt ~ ., data = day_m_te %>% 
    select(-one_of(unused_cols))) %>% data.frame %>% select(-1)) %>% add_column(split = "test", 
    .before = 1)

rf_02 <- randomForest(cnt ~ ., data = tr_new %>% select(-split), importance = T)

rf_results_01 <- bind_rows(day_m_tr, day_m_te) %>% group_by(split) %>% nest %>% 
    mutate(pred = map(data, ~predict(rf_01, newdata = .))) %>% mutate(data = map2(data, 
    pred, ~bind_cols(data, pred))) %>% select(data) %>% unnest %>% ungroup

rf_results_02 <- bind_rows(tr_new, te_new) %>% group_by(split) %>% nest %>% 
    mutate(pred = map(data, ~predict(rf_02, newdata = .))) %>% mutate(data = map2(data, 
    pred, ~bind_cols(data, pred))) %>% select(data) %>% unnest %>% ungroup

rmse_rf_01 <- rf_results_01 %>% group_by(split) %>% summarize(a = sqrt(mean((V1 - 
    cnt)^2)))
rmse_rf_01
# A tibble: 2 x 2
  split     a
  <chr> <dbl>
1 test   663.
2 train  321.
rmse_rf_02 <- rf_results_02 %>% group_by(split) %>% summarize(a = sqrt(mean((V1 - 
    cnt)^2)))
rmse_rf_02
# A tibble: 2 x 2
  split     a
  <chr> <dbl>
1 test   670.
2 train  330.
rmse_lm <- lm_results %>% group_by(split) %>% summarize(a = sqrt(mean((V1 - 
    cnt)^2)))
rmse_lm
# A tibble: 2 x 2
  split     a
  <chr> <dbl>
1 test   993.
2 train  834.
rf_results_01 %>% ggplot() + geom_point(aes(x = cnt, y = V1, color = split, 
    fill = split, shape = split), size = 3, alpha = 0.3) + theme_bw() + scale_color_aaas() + 
    theme(legend.position = c(0, 1), legend.justification = c(0, 1), legend.background = element_rect(color = "gray70"), 
        legend.text = element_text(size = 12), legend.title = element_text(size = 14), 
        axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 14), 
        axis.text.y = element_text(s = 12), axis.title.y = element_text(s = 14)) + 
    ylab("predicted") + xlab("y")

rf_imp <- sort(randomForest::importance(rf_01, type = 1, scale = FALSE)[, 1], 
    decreasing = T)
rf_imp %>% data.frame(importance = .) %>% rownames_to_column("feature") %>% 
    ggplot() + geom_bar(aes(x = factor(feature, levels = names(rf_imp)), y = importance), 
    stat = "identity", fill = "skyblue3", color = "skyblue4") + theme_bw() + 
    theme(axis.title.x = element_text(size = 14), axis.text.x = element_text(size = 12, 
        angle = -90), axis.title.y = element_text(size = 14), axis.text.y = element_text(size = 12)) + 
    xlab("factor")

library(pdp)

ice_rf <- partial(object = rf_01, pred.var = "temp", ice = TRUE)
plotPartial(ice_rf)

Customized plot function

plotICE <- function(model, feature, center) {
    obj <- partial(object = model, pred.var = feature, ice = TRUE, center = center)
    mean_obj <- obj %>% group_by(!!sym(feature)) %>% summarize(yhat = mean(yhat))
    ggplot(obj) + geom_line(aes(x = !!sym(feature), y = yhat, group = yhat.id), 
        alpha = 0.2) + geom_line(aes(x = !!sym(feature), y = yhat), color = "dodgerblue", 
        size = 1.2, data = mean_obj) + theme_bw() + theme(axis.title.x = element_text(size = 14), 
        axis.title.y = element_text(size = 14), axis.text.x = element_text(size = 12), 
        axis.text.y = element_text(size = 12))
}

plotPDP <- function(model, feature, target_var, data) {
    obj <- partial(object = model, pred.var = feature, ice = F)
    mean_y <- data[[target_var]] %>% mean
    lowess_df <- lowess(x = day_m_tr[[feature]], y = day_m_tr[[target_var]]) %>% 
        data.frame()
    ggplot() + geom_line(aes(x = !!sym(feature), y = yhat), size = 1.2, color = "dodgerblue", 
        data = obj) + geom_point(aes(x = !!sym(feature), y = !!sym(target_var)), 
        alpha = 0.2, data = data) + geom_line(aes(x = x, y = y), data = lowess_df, 
        color = "darkorange", size = 1.2) + geom_abline(intercept = mean_y, 
        slope = 0, color = "gray60", linetype = 2) + theme_bw() + theme(axis.title.x = element_text(size = 14), 
        axis.title.y = element_text(size = 14), axis.text.x = element_text(size = 12), 
        axis.text.y = element_text(size = 12))
    
}
  • ICE (not centered)
p1 <- plotICE(rf_01, "days_since_2011", F)
p2 <- plotICE(rf_01, "temp", F)
p3 <- plotICE(rf_01, "atemp", F)
p4 <- plotICE(rf_01, "hum", F)

gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)

  • ICE (centered)
p5 <- plotICE(rf_01, "days_since_2011", T)
p6 <- plotICE(rf_01, "temp", T)
p7 <- plotICE(rf_01, "atemp", T)
p8 <- plotICE(rf_01, "hum", T)

gridExtra::grid.arrange(p5, p6, p7, p8, nrow = 2)

  • PDP (+ lowess)
p1 <- plotPDP(rf_01, "days_since_2011", "cnt", day_m_tr)
p2 <- plotPDP(rf_01, "temp", "cnt", day_m_tr)
p3 <- plotPDP(rf_01, "atemp", "cnt", day_m_tr)
p4 <- plotPDP(rf_01, "hum", "cnt", day_m_tr)

gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)

  • PDP (3D)
pdp2 <- partial(object = rf_01, pred.var = c("days_since_2011", "temp"), chull = T)
plotPartial(pdp2)

plotPartial(pdp2, levelplot = F, drape = T, screen = list(z = -20, x = -70))

tmyst

2020-02-24